In [1]:
import os
NOVA_HOME = '/home/labs/hornsteinlab/Collaboration/MOmaps_Noam/MOmaps'
NOVA_DATA_HOME = '/home/labs/hornsteinlab/Collaboration/MOmaps'
LOGS_PATH = os.path.join(NOVA_DATA_HOME, "outputs/preprocessing/spd/logs/dNLS")
PLOT_PATH = os.path.join(NOVA_HOME, 'src', 'preprocessing', 'notebooks','figures','dNLS')
NOVA_HOME = '/home/labs/hornsteinlab/Collaboration/NOVA_Oz/NOVA'
NOVA_DATA_HOME = '/home/labs/hornsteinlab/Collaboration/MOmaps'
LOGS_PATH = os.path.join(NOVA_HOME, 'logs', 'dnls')
PLOT_PATH = os.path.join(NOVA_HOME, 'src', 'preprocessing', 'notebooks','figures','dnls_80pct')
os.chdir(NOVA_HOME)
import pandas as pd
import numpy as np
# plt.rcParams["image.cmap"] = "Set1"
import contextlib
import io
from IPython.display import display, Javascript
from tools.preprocessing_tools.qc_reports.qc_utils import log_files_qc, run_validate_folder_structure, display_diff, sample_and_calc_variance, \
show_site_survival_dapi_brenner, show_site_survival_dapi_cellpose, \
show_site_survival_dapi_tiling, show_site_survival_target_brenner, \
calc_total_sums, plot_filtering_heatmap, show_total_sum_tables, \
plot_cell_count, plot_catplot, plot_hm_combine_batches, plot_hm, \
run_calc_hist_new
from tools.preprocessing_tools.qc_reports.qc_config import dnls_panels, dnls_markers, dnls_marker_info, dnls_cell_lines, \
dnls_cell_lines_to_cond, dnls_cell_lines_for_disp, reps, \
dnls_line_colors, dnls_lines_order, dnls_custom_palette, \
dnls_expected_dapi_raw, markers, custom_palette
%load_ext autoreload
%autoreload 2
In [14]:
# choose batches
batches = [f'batch{i}' for i in range(3,6)]
batches
Out[14]:
['batch3', 'batch4', 'batch5']
In [15]:
df = log_files_qc(LOGS_PATH, only_wt_cond=False, batches=batches)
# we are not using the wt line eventually
df = df[df.cell_line != 'WT']
df_dapi = df[df.marker=='DAPI']
df_target = df[df.marker!='DAPI']
# we need to match between the raw marker name (TDP43) and the processed marker name (TDP43N / TDP43B)
df_target.loc[df_target['marker'] == 'TDP43', 'marker'] += df_target['panel'].str.replace('panel', '')
reading logs of batch4 reading logs of batch3 reading logs of batch5 Total of 3 files were read. Before dup handeling (76919, 21) After duplication removal #1: (76919, 22) After duplication removal #2: (76919, 22)
Actual Files Validation¶
Raw Files Validation¶
- How many site tiff files do we have in each folder?
- Are all existing files valid? (tif, at least 2049kB, not corrupetd)
In [16]:
root_directory_raw = os.path.join(NOVA_DATA_HOME, 'input', 'images', 'raw', 'SpinningDisk','deltaNLS_sort')
raws = run_validate_folder_structure(root_directory_raw, False, dnls_panels, dnls_markers.copy(),PLOT_PATH, dnls_marker_info,
dnls_cell_lines_to_cond, reps, dnls_cell_lines_for_disp, dnls_expected_dapi_raw,
batches=batches, fig_width=3)
batch3 Folder structure is invalid. Missing 1 paths: /home/labs/hornsteinlab/Collaboration/MOmaps/input/images/raw/SpinningDisk/deltaNLS_sort/batch3/TDP43/panelN No bad files are found. Total Sites: 17200
======== batch4 Folder structure is invalid. Missing 1 paths: /home/labs/hornsteinlab/Collaboration/MOmaps/input/images/raw/SpinningDisk/deltaNLS_sort/batch4/TDP43/panelN No bad files are found. Total Sites: 17200
======== batch5 Folder structure is valid. No bad files are found. Total Sites: 18000
======== ====================
Processed Files Validation¶
- How many site npy files do we have in each folder? -> How many sites survived the pre-processing?
- Are all existing files valid? (at least 100kB, npy not corrupted)
In [17]:
root_directory_proc = os.path.join(NOVA_DATA_HOME, 'input', 'images', 'processed', 'spd2',
'SpinningDisk','deltaNLS_80pct')
procs = run_validate_folder_structure(root_directory_proc, True, dnls_panels, dnls_markers,PLOT_PATH,dnls_marker_info,
dnls_cell_lines_to_cond, reps, dnls_cell_lines_for_disp, dnls_expected_dapi_raw,
batches=batches, fig_width=3)
batch3 Folder structure is invalid. Missing 4 paths: /home/labs/hornsteinlab/Collaboration/MOmaps/input/images/processed/spd2/SpinningDisk/deltaNLS_80pct/batch3/TDP43/dox/TDP43N /home/labs/hornsteinlab/Collaboration/MOmaps/input/images/processed/spd2/SpinningDisk/deltaNLS_80pct/batch3/TDP43/dox/TDP43B /home/labs/hornsteinlab/Collaboration/MOmaps/input/images/processed/spd2/SpinningDisk/deltaNLS_80pct/batch3/TDP43/Untreated/TDP43N /home/labs/hornsteinlab/Collaboration/MOmaps/input/images/processed/spd2/SpinningDisk/deltaNLS_80pct/batch3/TDP43/Untreated/TDP43B No bad files are found. Total Sites: 16748
======== batch4 Folder structure is invalid. Missing 4 paths: /home/labs/hornsteinlab/Collaboration/MOmaps/input/images/processed/spd2/SpinningDisk/deltaNLS_80pct/batch4/TDP43/dox/TDP43N /home/labs/hornsteinlab/Collaboration/MOmaps/input/images/processed/spd2/SpinningDisk/deltaNLS_80pct/batch4/TDP43/dox/TDP43B /home/labs/hornsteinlab/Collaboration/MOmaps/input/images/processed/spd2/SpinningDisk/deltaNLS_80pct/batch4/TDP43/Untreated/TDP43N /home/labs/hornsteinlab/Collaboration/MOmaps/input/images/processed/spd2/SpinningDisk/deltaNLS_80pct/batch4/TDP43/Untreated/TDP43B No bad files are found. Total Sites: 16634
======== batch5 Folder structure is invalid. Missing 4 paths: /home/labs/hornsteinlab/Collaboration/MOmaps/input/images/processed/spd2/SpinningDisk/deltaNLS_80pct/batch5/TDP43/dox/TDP43N /home/labs/hornsteinlab/Collaboration/MOmaps/input/images/processed/spd2/SpinningDisk/deltaNLS_80pct/batch5/TDP43/dox/TDP43B /home/labs/hornsteinlab/Collaboration/MOmaps/input/images/processed/spd2/SpinningDisk/deltaNLS_80pct/batch5/TDP43/Untreated/TDP43N /home/labs/hornsteinlab/Collaboration/MOmaps/input/images/processed/spd2/SpinningDisk/deltaNLS_80pct/batch5/TDP43/Untreated/TDP43B No bad files are found. Total Sites: 17125
======== ====================
Difference between Raw and Processed¶
In [18]:
display_diff(batches, raws, procs, PLOT_PATH, fig_width=3)
batch3
======== batch4
======== batch5
========
Variance in each batch (of processed files)¶
In [10]:
for batch in batches:
with contextlib.redirect_stdout(io.StringIO()):
var = sample_and_calc_variance(root_directory_proc, batch,
sample_size_per_markers=200, cond_count=2, rep_count=len(reps),
num_markers=len(dnls_markers))
print(f'{batch} var: ',var)
batch3 var: 0.010259739259552004 batch4 var: 0.010513111541071974 batch5 var: 0.010152732140884965
Preprocessing Filtering qc¶
By order of filtering
1. % site survival after Brenner on DAPI channel¶
Percentage out of the total sites
In [19]:
dapi_filter_by_brenner = show_site_survival_dapi_brenner(df_dapi,batches, dnls_line_colors, dnls_panels, reps, figsize=(3,5))
2. % Site survival after Cellpose¶
Percentage out of the sites that passed the previous filter. In parenthesis are absolute values.
A site will be filtered out if Cellpose found 0 cells in it.
In [20]:
dapi_filter_by_cellpose = show_site_survival_dapi_cellpose(df_dapi, batches, dapi_filter_by_brenner, dnls_line_colors,
dnls_panels, reps, figsize=(3,5))
3. % Site survival by tiling¶
Percentage out of the sites that passed the previous filter. In parenthesis are absolute values.
A site will be filtered out if after tiling, no tile is containing at least one whole cell that Cellpose detected.
In [21]:
dapi_filter_by_tiling=show_site_survival_dapi_tiling(df_dapi, batches, dapi_filter_by_cellpose, dnls_line_colors, dnls_panels,
reps, figsize=(3,5))
4. % Site survival after Brenner on target channel¶
Percentage out of the sites that passed the previous filter. In parenthesis are absolute values (if different than the percentages).
In [22]:
show_site_survival_target_brenner(df_dapi, df_target, dapi_filter_by_tiling, dnls_markers,figsize=(3,8))
Statistics About the Processed Files¶
In [23]:
names = ['Total number of tiles', 'Total number of whole cells']
stats = ['n_valid_tiles','site_whole_cells_counts_sum','site_cell_count','site_cell_count_sum']
total_sum = calc_total_sums(df_target, df_dapi, stats, dnls_markers)
Total tiles¶
In [24]:
markers_for_dnls = markers.copy()
markers_for_dnls.remove('TIA1')
markers_for_dnls += ['TDP43B']
total_sum[total_sum.marker.isin(markers_for_dnls)].n_valid_tiles.sum()
Out[24]:
468671
Total whole nuclei in tiles¶
In [25]:
total_sum[total_sum.marker =='DAPI'].site_whole_cells_counts_sum.sum()
Out[25]:
125177.0
Total nuclei in sites¶
In [26]:
total_sum[total_sum.marker =='DAPI'].site_cell_count.sum()
Out[26]:
328744.0
In [27]:
show_total_sum_tables(total_sum)
| n_valid_tiles | % valid tiles | site_whole_cells_counts_sum | site_cell_count | |
|---|---|---|---|---|
| batch3 | ||||
| count | 172.000000 | 172.000000 | 172.000000 | 172.00000 |
| mean | 1095.517442 | 10.955174 | 826.500000 | 2144.02907 |
| std | 125.228982 | 1.252290 | 86.795754 | 247.21836 |
| min | 816.000000 | 8.160000 | 624.000000 | 1548.00000 |
| 25% | 1014.000000 | 10.140000 | 782.500000 | 1961.00000 |
| 50% | 1089.500000 | 10.895000 | 822.000000 | 2160.50000 |
| 75% | 1196.000000 | 11.960000 | 896.000000 | 2320.75000 |
| max | 1328.000000 | 13.280000 | 1004.000000 | 2644.00000 |
| sum | 188429.000000 | NaN | 142158.000000 | 368773.00000 |
| expected_count | 450.000000 | 450.000000 | 450.000000 | 450.00000 |
| n_valid_tiles | % valid tiles | site_whole_cells_counts_sum | site_cell_count | |
|---|---|---|---|---|
| batch4 | ||||
| count | 172.000000 | 172.000000 | 172.000000 | 172.000000 |
| mean | 1021.773256 | 10.217733 | 750.988372 | 2005.116279 |
| std | 168.849844 | 1.688498 | 123.873874 | 326.967242 |
| min | 580.000000 | 5.800000 | 454.000000 | 1110.000000 |
| 25% | 909.000000 | 9.090000 | 655.000000 | 1788.000000 |
| 50% | 1037.000000 | 10.370000 | 758.500000 | 2039.000000 |
| 75% | 1138.000000 | 11.380000 | 841.000000 | 2202.750000 |
| max | 1328.000000 | 13.280000 | 995.000000 | 2614.000000 |
| sum | 175745.000000 | NaN | 129170.000000 | 344880.000000 |
| expected_count | 450.000000 | 450.000000 | 450.000000 | 450.000000 |
| n_valid_tiles | % valid tiles | site_whole_cells_counts_sum | site_cell_count | |
|---|---|---|---|---|
| batch5 | ||||
| count | 180.000000 | 180.000000 | 180.000000 | 180.000000 |
| mean | 1027.938889 | 10.279389 | 769.911111 | 2013.633333 |
| std | 150.921977 | 1.509220 | 110.270986 | 299.418653 |
| min | 644.000000 | 6.440000 | 497.000000 | 1291.000000 |
| 25% | 924.250000 | 9.242500 | 698.500000 | 1771.000000 |
| 50% | 1061.000000 | 10.610000 | 786.000000 | 2064.000000 |
| 75% | 1129.000000 | 11.290000 | 845.000000 | 2196.000000 |
| max | 1301.000000 | 13.010000 | 1012.000000 | 2544.000000 |
| sum | 185029.000000 | NaN | 138584.000000 | 362454.000000 |
| expected_count | 450.000000 | 450.000000 | 450.000000 | 450.000000 |
| n valid tiles | % valid tiles | site_whole_cells_counts_sum | site_cell_count | |
|---|---|---|---|---|
| All batches | ||||
| count | 524.000000 | 524.000000 | 524.000000 | 5.240000e+02 |
| mean | 1048.097328 | 10.480973 | 782.274809 | 2.053639e+03 |
| std | 152.814238 | 1.528142 | 112.515547 | 2.993880e+02 |
| min | 580.000000 | 5.800000 | 454.000000 | 1.110000e+03 |
| 25% | 937.000000 | 9.370000 | 705.000000 | 1.846000e+03 |
| 50% | 1067.000000 | 10.670000 | 794.000000 | 2.102000e+03 |
| 75% | 1172.000000 | 11.720000 | 861.000000 | 2.267000e+03 |
| max | 1328.000000 | 13.280000 | 1012.000000 | 2.644000e+03 |
| sum | 549203.000000 | NaN | 409912.000000 | 1.076107e+06 |
| expected_count | 450.000000 | 450.000000 | 450.000000 | 4.500000e+02 |
Show Total Tile Counts¶
For each batch, cell line, replicate and markerTotal number of tiles
In [28]:
to_heatmap = total_sum.rename(columns={'n_valid_tiles':'index'})
plot_filtering_heatmap(to_heatmap, extra_index='marker', vmin=None, vmax=None,
xlabel = 'Total number of tiles', show_sum=True, figsize=(3,8))
/home/labs/hornsteinlab/Collaboration/NOVA_Oz/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:381: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator. ax.set_yticklabels(ax.get_yticklabels(), fontsize=6) /home/labs/hornsteinlab/Collaboration/NOVA_Oz/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:381: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator. ax.set_yticklabels(ax.get_yticklabels(), fontsize=6)
/home/labs/hornsteinlab/Collaboration/NOVA_Oz/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:381: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator. ax.set_yticklabels(ax.get_yticklabels(), fontsize=6) /home/labs/hornsteinlab/Collaboration/NOVA_Oz/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:381: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator. ax.set_yticklabels(ax.get_yticklabels(), fontsize=6)
/home/labs/hornsteinlab/Collaboration/NOVA_Oz/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:381: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator. ax.set_yticklabels(ax.get_yticklabels(), fontsize=6) /home/labs/hornsteinlab/Collaboration/NOVA_Oz/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:381: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator. ax.set_yticklabels(ax.get_yticklabels(), fontsize=6)
Show Total Whole Cell Counts¶
For each batch, cell line, replicate and markerTotal number of tiles
In [29]:
to_heatmap = total_sum.rename(columns={'site_whole_cells_counts_sum':'index'})
plot_filtering_heatmap(to_heatmap, extra_index='marker', vmin=None, vmax=None,
xlabel = 'Total number of whole cells', show_sum=True, figsize=(3,8))
/home/labs/hornsteinlab/Collaboration/NOVA_Oz/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:381: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator. ax.set_yticklabels(ax.get_yticklabels(), fontsize=6) /home/labs/hornsteinlab/Collaboration/NOVA_Oz/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:381: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator. ax.set_yticklabels(ax.get_yticklabels(), fontsize=6)
/home/labs/hornsteinlab/Collaboration/NOVA_Oz/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:381: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator. ax.set_yticklabels(ax.get_yticklabels(), fontsize=6) /home/labs/hornsteinlab/Collaboration/NOVA_Oz/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:381: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator. ax.set_yticklabels(ax.get_yticklabels(), fontsize=6)
/home/labs/hornsteinlab/Collaboration/NOVA_Oz/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:381: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator. ax.set_yticklabels(ax.get_yticklabels(), fontsize=6) /home/labs/hornsteinlab/Collaboration/NOVA_Oz/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:381: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator. ax.set_yticklabels(ax.get_yticklabels(), fontsize=6)
Show Cell Count Statistics per Batch¶
In [30]:
df_no_empty_sites = df_dapi[df_dapi.n_valid_tiles !=0]
plot_cell_count(df_no_empty_sites, dnls_lines_order, dnls_custom_palette, y='site_cell_count_sum',
title='Cell Count Average per Site (from tiles)')
plot_cell_count(df_no_empty_sites, dnls_lines_order, dnls_custom_palette, y='site_whole_cells_counts_sum',
title='Whole Cell Count Average per Site')
plot_cell_count(df_no_empty_sites, dnls_lines_order, dnls_custom_palette, y='site_cell_count',
title='Cellpose Cell Count Average per Site')
Show Tiles per Site Statistics¶
In [31]:
df_dapi.groupby(['cell_line_cond']).n_valid_tiles.mean()
Out[31]:
cell_line_cond TDP43 Untreated 9.59600 TDP43 dox 11.36725 Name: n_valid_tiles, dtype: float64
In [32]:
df_dapi[['site_cell_count']].mean()
Out[32]:
site_cell_count 20.5465 dtype: float64
In [33]:
plot_catplot(df_dapi, custom_palette,reps, x='n_valid_tiles', x_title='valid tiles count', batch_min=3, batch_max=5, height=6)
/home/labs/hornsteinlab/Collaboration/NOVA_Oz/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:1017: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df.loc[:, 'batch_rep'] = df['batch'] + " " + df['rep']
Show Mean of cell count in valid tiles¶
In [34]:
plot_hm(df_dapi, split_by='rep', rows='cell_line_cond', columns='panel', figsize=(10,3))
Assessing Staining Reproducibility and Outliers¶
In [35]:
dnls_cell_lines_for_disp
Out[35]:
{'TDP43_dox': 'TDP43_dox', 'TDP43_Untreated': 'TDP43_Untreated'}
In [36]:
for batch in batches:
print(batch)
run_calc_hist_new(f'{batch}', dnls_cell_lines_for_disp, dnls_markers,
root_directory_raw, root_directory_proc,
hist_sample=10,sample_size_per_markers=200, ncols=8, nrows=4, dnls=True)
print("="*30)
batch3
============================== batch4
============================== batch5
==============================
In [37]:
# save notebook as HTML ( the HTML will be saved in the same folder the original script is)
from IPython.display import display, Javascript
display(Javascript('IPython.notebook.save_checkpoint();'))
os.system(f'jupyter nbconvert --to html tools/preprocessing_tools/qc_reports/qc_report_dNLS_80pct.ipynb --output {NOVA_HOME}/manuscript/preprocessing_qc_reports/qc_report_dNLS_80pct.html')
usage: jupyter [-h] [--version] [--config-dir] [--data-dir] [--runtime-dir]
[--paths] [--json] [--debug]
[subcommand]
Jupyter: Interactive Computing
positional arguments:
subcommand the subcommand to launch
optional arguments:
-h, --help show this help message and exit
--version show the versions of core jupyter packages and exit
--config-dir show Jupyter config dir
--data-dir show Jupyter data dir
--runtime-dir show Jupyter runtime dir
--paths show all Jupyter paths. Add --json for machine-readable
format.
--json output paths as machine-readable json
--debug output debug information about paths
Available subcommands: kernel kernelspec migrate run troubleshoot
Jupyter command `jupyter-nbconvert` not found.
Out[37]:
256
In [ ]: